#|output: false
#|cache: false
#|cache.lazy: false
source("utils.R")
load("data/06.doc_bathy_ann_pred.Rdata")Analyse predictons of annual bathypelagic DOC
Model evaluation, interpretation & projections
Set-up and load data
Model evaluation
Rsquares
Black dots on the R² boxplots show the actual values.
# Unnest predictions
preds <- res %>% select(fold, cv_type, preds) %>% unnest(preds)
# Compute Rsquare for each fold
rsquares <- preds %>%
group_by(cv_type, fold) %>%
rsq(truth = log_doc_bathy, estimate = .pred)
# Distribution of Rsquares by CV type
rsquares %>% split(.$cv_type) %>% map(summary)$stratified
cv_type fold .metric .estimator
Length:10 Length:10 Length:10 Length:10
Class :character Class :character Class :character Class :character
Mode :character Mode :character Mode :character Mode :character
.estimate
Min. :0.6934
1st Qu.:0.7382
Median :0.7701
Mean :0.7681
3rd Qu.:0.7960
Max. :0.8408
# Plot Rsquares values
ggplot(rsquares) +
geom_boxplot(aes(x = cv_type, y = .estimate, group = cv_type, colour = cv_type)) +
geom_jitter(aes(x = cv_type, y = .estimate), size = 0.5, width = 0.1) +
scale_y_continuous(limits = c(0, 1), expand = c(0, 0)) +
labs(x = "CV type", y = "R²", colour = "CV type")Predictions VS truth
Plot pred VS truth on the test part of each fold.
preds %>%
ggplot() +
geom_point(aes(x = log_doc_bathy, y = .pred, colour = cv_type)) +
geom_abline(intercept = 0, slope = 1, colour = "red") +
coord_fixed() +
facet_wrap(cv_type~fold)Now let’s focus on a representative fold.
# Find the one closer to the median and plot it
repres_fold <- rsquares %>%
group_by(cv_type) %>%
mutate(diff = abs(.estimate - median(.estimate))) %>%
filter(diff == min(diff)) %>%
slice_head(n = 1)
repres_fold %>%
select(cv_type, fold) %>%
left_join(preds, by = join_by(cv_type, fold)) %>%
ggplot() +
geom_point(aes(x = log_doc_bathy, y = .pred, colour = cv_type)) +
geom_abline(intercept = 0, slope = 1, colour = "red") +
coord_fixed() +
labs(title = "Pred VS truth for a representative fold") +
facet_grid(~cv_type)Model interpretation
Variable importance
Variable importance for each fold.
# Unnest variable importance
full_vip <- res %>%
select(cv_type, fold, importance) %>%
unnest(importance) %>%
mutate(variable = forcats::fct_reorder(variable, dropout_loss))
# Get RMSE for full model
full_rmse <- full_vip %>%
filter(variable == "_full_model_") %>%
select(cv_type, fold, full_dropout_loss = dropout_loss) %>%
distinct()
# Compute RMSE increase between loss and full model loss
full_vip %>%
filter(!str_starts(variable, "_")) %>% # Keep only variables of interest
# Join with full model RMSE
left_join(full_rmse, by = join_by(cv_type, fold), relationship = "many-to-many") %>%
# Compute difference
mutate(diff_loss = dropout_loss - full_dropout_loss) %>%
ggplot() +
geom_vline(xintercept = 0, linewidth = 2, colour = "grey") +
geom_boxplot(aes(x = diff_loss, y = variable, colour = cv_type)) +
labs(x = "RMSE increase", y = "Variable", colour = "CV type") +
facet_grid(fold~cv_type)Now let’s take the mean across folds.
# Compute RMSE increase between loss and full model loss
full_vip %>%
filter(!str_starts(variable, "_")) %>% # Keep only variables of interest
# Join with full model RMSE
left_join(full_rmse, by = join_by(cv_type, fold), relationship = "many-to-many") %>%
# Compute difference
mutate(diff_loss = dropout_loss - full_dropout_loss) %>%
group_by(cv_type, fold, variable) %>%
summarise(diff_loss = mean(diff_loss), .groups = "drop") %>%
ggplot() +
geom_vline(xintercept = 0, linewidth = 2, colour = "grey") +
geom_boxplot(aes(x = diff_loss, y = variable, colour = cv_type)) +
labs(x = "RMSE increase", y = "Variable", colour = "CV type")Partial dependence plots
Finally, let’s have a look at partial dependence plots.
blue line: prediction mean across cp profiles
grey ribbon: prediction sd across centered cp profiles
# Variables for which to plot pdp
n_pdp <- 3
vars_pdp <- full_vip %>%
filter(variable != "_full_model_") %>%
mutate(variable = as.character(variable)) %>%
group_by(cv_type, variable) %>%
summarise(dropout_loss = mean(dropout_loss), .groups = "drop") %>%
arrange(desc(dropout_loss)) %>%
group_by(cv_type) %>%
slice_head(n = n_pdp)
# Unnest cp_profiles
cp_profiles <- res %>% select(cv_type, fold, cp_profiles) %>% unnest(cp_profiles)
## Let’s generate averaged cp profile across folds for each cv-type and propagating uncertainties.
## The difficulty is that x values differ between each fold, the solution is to interpolate yhat on a common set of x values across folds.
## Steps as follows for each cv_type and each variable
## 1- compute the mean and spread of cp profiles within each fold
## 2- interpolate yhat value and spread within each fold using a common set of x values
## 3- perform a weighted average of yhat value and spread, using 1/var as weights
# Get names of folds, for later use
folds <- sort(unique(full_vip$fold))
# Apply on each cv_type and variable
mean_pdp <- lapply(1:nrow(vars_pdp), function(r){
# Get variable and cvtype
var_name <- vars_pdp[r,]$variable
cv_type_name <- vars_pdp[r,]$cv_type
## Get corresponding CP profiles, compute mean and spread for each fold (step 1)
d_pdp <- cp_profiles %>%
filter(cv_type == cv_type_name & `_vname_` == var_name) %>%
select(cv_type, fold, `_yhat_`, `_vname_`, `_ids_`, all_of(var_name)) %>%
arrange(`_ids_`, across(all_of(var_name))) %>%
# center each cp profiles across fold, variable and ids
group_by(cv_type, fold, `_vname_`, `_ids_`) %>%
mutate(yhat_cent = `_yhat_` - mean(`_yhat_`)) %>% # center cp profiles
ungroup() %>%
# compute mean and sd of centered cp profiles for each fold and value of the variable of interest
group_by(cv_type, fold, across(all_of(var_name))) %>%
summarise(
yhat_loc = mean(`_yhat_`), # compute mean of profiles
yhat_spr = sd(yhat_cent), # compute sd of cp profiles
.groups = "keep"
) %>%
ungroup() %>%
setNames(c("cv_type", "fold", "x", "yhat_loc", "yhat_spr"))
## Interpolate yhat values and spread on a common x distribution (step 2)
# Regularise across folds: need a common x distribution, and interpolate y on this new x
new_x <- quantile(d_pdp$x, probs = seq(0, 1, 0.01), names = FALSE)
# x is different within each fold, so interpolation is performed on each fold
int_pdp <- lapply(1:length(folds), function(i){
# Get data corresponding to this fold
fold_name <- folds[i]
this_fold <- d_pdp %>% filter(fold == fold_name)
# Extract original x values
x <- this_fold$x
# Extract values to interpolate (yhat_loc and yhat_spr)
yhat_loc <- this_fold$yhat_loc
yhat_spr <- this_fold$yhat_spr
# Interpolate yhat_loc and yhat_spr on new x values
int <- tibble(
x = new_x,
yhat_loc = castr::interpolate(x = x, y = yhat_loc, xout = new_x),
yhat_spr = castr::interpolate(x = x, y = yhat_spr, xout = new_x),
) %>%
mutate(
cv_type = cv_type_name,
fold = fold_name,
var_name = var_name,
.before = x
)
# Return the result
return(int)
}) %>%
bind_rows()
## Across fold, compute the weighted mean, using 1/var as weights (step 3)
mean_pdp <- int_pdp %>%
group_by(cv_type, var_name, x) %>%
summarise(
yhat_loc = wtd.mean(yhat_loc, weights = 1/(yhat_spr)^2),
yhat_spr = wtd.mean(yhat_spr, weights = 1/(yhat_spr)^2),
.groups = "drop"
) %>%
arrange(x)
# Return the result
return(mean_pdp)
}) %>%
bind_rows()
# Arrange in order of most important variables
mean_pdp <- vars_pdp %>%
rename(var_name = variable) %>%
left_join(mean_pdp, by = join_by(cv_type, var_name)) %>%
mutate(var_name = fct_inorder(var_name)) %>%
select(-dropout_loss)
# Plot it!
ggplot(mean_pdp) +
geom_path(aes(x = x, y = yhat_loc, colour = cv_type)) +
geom_ribbon(aes(x = x, ymin = yhat_loc - yhat_spr, ymax = yhat_loc + yhat_spr, fill = cv_type), alpha = 0.2) +
facet_wrap(~var_name, scales = "free_x")New predictions
Collect predictions
# Unnest new predictions (i.e. projections)
new_preds <- res %>%
select(fold, cv_type, new_preds) %>%
unnest(new_preds) %>%
# Apply exp to predictions as we predicted log(doc)
mutate(pred_doc = exp(pred_doc_log), .after = pred_doc_log) %>%
select(cv_type, fold, lon, lat, contains("doc"))
# Get those from stratified CV
new_preds_strat <- new_preds %>% filter(cv_type == "stratified")
# Join projections with R² value for each fold and each cv_type.
new_preds_strat <- new_preds_strat %>% left_join(rsquares %>% select(cv_type, fold, rsq = .estimate), by = join_by(cv_type, fold))
## Average by pixel
# Stratified
strat_proj <- new_preds_strat %>%
group_by(lon, lat) %>%
summarise(
doc_avg = wtd.mean(pred_doc, weights = rsq, na.rm = TRUE),
doc_sd = sqrt(wtd.var(pred_doc, weights = rsq, na.rm = TRUE)),
.groups = "drop"
)
# Generate common colour bar limits for both CV types
doc_avg_lims <- c(
min(c(strat_proj$doc_avg)),
max(c(strat_proj$doc_avg))
)
doc_sd_lims <- c(
min(c(strat_proj$doc_sd)),
max(c(strat_proj$doc_sd))
)Maps
Stratified CV
ggplot(strat_proj) +
geom_polygon(data = world, aes(x = lon, y = lat, group = group), fill = "grey") +
geom_raster(aes(x = lon, y = lat, fill = doc_avg)) +
ggplot2::scale_fill_viridis_c(option = "F", limits = doc_avg_lims, trans = "log1p") +
labs(title = "DOC avg from stratified CV") +
coord_quickmap(expand = 0)ggplot(strat_proj) +
geom_polygon(data = world, aes(x = lon, y = lat, group = group), fill = "grey") +
geom_raster(aes(x = lon, y = lat, fill = doc_sd)) +
ggplot2::scale_fill_viridis_c(option = "E", limits = doc_sd_lims, trans = "log1p") +
labs(title = "DOC sd from stratified CV") +
coord_quickmap(expand = 0)Look at variations across CV folds.
new_preds_strat %>%
# Add information of R² to cv name
mutate(fold = paste0(fold, " (R² = ", percent(rsq, accuracy = 0.1), ")")) %>%
ggplot() +
geom_polygon(data = world, aes(x = lon, y = lat, group = group), fill = "grey") +
geom_raster(aes(x = lon, y = lat, fill = pred_doc)) +
ggplot2::scale_fill_viridis_c(option = "F", limits = doc_avg_lims, trans = "log1p") +
labs(title = "DOC pred across stratified CV folds") +
coord_quickmap(expand = 0) +
facet_wrap(~fold)Values in the North Pacific
# Read data from the north pacific
# https://www.naturalearthdata.com/downloads/10m-physical-vectors/10m-physical-labels/
n_pac <- sf::read_sf("data/raw/ne_10m_geography_marine_polys/ne_10m_geography_marine_polys.shp") %>%
filter(featurecla == "ocean") %>%
filter(name == "North Pacific Ocean") %>%
dplyr::select(name, geometry)
# Convert predicted points to sf and set crs
point.sf <- st_as_sf(strat_proj, coords = c("lon","lat"))
st_crs(point.sf) <- st_crs(n_pac$geometry)
# Filter points within the Pacific Ocean and extract geometry
preds_n_pac <- st_filter(point.sf, n_pac$geometry) %>%
tidyr::extract(geometry, c('lon', 'lat'), '\\((.*), (.*)\\)', convert = TRUE)
ggplot(preds_n_pac) +
geom_tile(aes(x = lon, y = lat, colour = doc_avg, fill = doc_avg)) +
geom_polygon(data = world, aes(x = lon, y = lat, group = group), fill = "grey") +
ggplot2::scale_fill_viridis_c(option = "F", trans = "log1p") +
ggplot2::scale_colour_viridis_c(option = "F", trans = "log1p", guide = "none") +
coord_quickmap(expand = 0)summary(preds_n_pac) doc_avg doc_sd lon lat
Min. :37.63 Min. :0.02973 Min. :-179.50 Min. : 0.50
1st Qu.:38.06 1st Qu.:0.07676 1st Qu.:-152.50 1st Qu.:10.25
Median :38.52 Median :0.11395 Median :-126.50 Median :22.50
Mean :38.59 Mean :0.16866 Mean : -41.03 Mean :23.82
3rd Qu.:38.70 3rd Qu.:0.23539 3rd Qu.: 151.50 3rd Qu.:36.50
Max. :42.90 Max. :1.31048 Max. : 179.50 Max. :57.50